Data analysis for:
Dylan J. Padilla Perez, 2023. Geographic variation of the for gene reveals signatures of local adaptation in Drosophila melanogaster. Dylan J. Padilla Perez, School of Life Sciences, Arizona State University, Tempe, AZ 85287, USA.Library
library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(dplyr)
library(ecodist)
library(ggsn)
library(grid)
library(gridBase)
library(ggplot2)
library(gplots)
library(ggpubr)
library(ggspatial)
library(hierfstat)
library(lattice)
library(LEA)
library(lfmm)
library(MASS)
library(maps)
library(maptools)
library(mapplots)
library(miscTools)
library(naniar)
library(OutFLANK)
library(patchwork)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rgeos)
library(rnaturalearth)
library(rnaturalearthdata)
library(rworldxtra)
library(rworldmap)
library(scales)
library(sf)
library(shape)
library(SeqArray)
library(SeqVarTools)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)
Session information
R.version
_
platform x86_64-apple-darwin17.0
arch x86_64
os darwin17.0
system x86_64, darwin17.0
status
major 4
minor 1.1
year 2021
month 08
day 10
svn rev 80725
language R
version.string R version 4.1.1 (2021-08-10)
nickname Kick Things
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] vegan_2.6-4 permute_0.9-7 vcfR_1.14.0
[4] stringr_1.5.0 SNPRelate_1.28.0 SeqVarTools_1.32.0
[7] SeqArray_1.34.0 gdsfmt_1.30.0 shape_1.4.6
[10] sf_1.0-9 scales_1.2.1 rworldmap_1.3-6
[13] rworldxtra_1.01 rnaturalearthdata_0.1.0 rnaturalearth_0.3.2
[16] rgeos_0.6-1 reshape_0.8.9 raster_3.6-14
[19] randomcoloR_1.1.0.1 poppr_2.9.3 pegas_1.1
[22] ape_5.7 patchwork_1.1.2 OutFLANK_0.2
[25] qvalue_2.26.0 naniar_1.0.0 miscTools_0.6-26
[28] mapplots_1.5.1 maptools_1.1-6 sp_1.6-0
[31] maps_3.4.1 MASS_7.3-58.2 lfmm_1.1
[34] LEA_3.6.0 lattice_0.20-45 hierfstat_0.5-11
[37] ggspatial_1.1.7 ggpubr_0.6.0 gplots_3.1.3
[40] gridBase_0.4-7 ggsn_0.5.0 ecodist_2.0.9
[43] dartR_2.7.2 dartR.data_1.0.2 dplyr_1.1.0
[46] ggplot2_3.4.1 data.table_1.14.6 ComplexHeatmap_2.10.0
[49] car_3.1-1 carData_3.0-5 adegenet_2.1.10
[52] ade4_1.7-22 rmarkdown_2.20
loaded via a namespace (and not attached):
[1] utf8_1.2.3 R.utils_2.12.2 tidyselect_1.2.0
[4] combinat_0.0-8 Rtsne_0.16 StAMPP_1.6.3
[7] GWASExactHW_1.01 munsell_0.5.0 codetools_0.2-19
[10] units_0.8-1 withr_2.5.0 colorspace_2.1-0
[13] progressr_0.13.0 Biobase_2.54.0 highr_0.10
[16] knitr_1.42 stats4_4.1.1 ggsignif_0.6.4
[19] labeling_0.4.2 RgoogleMaps_1.4.5.3 logistf_1.24.1
[22] GenomeInfoDbData_1.2.7 farver_2.1.1 gap.datasets_0.0.5
[25] vctrs_0.5.2 generics_0.1.3 xfun_0.37
[28] GenomeInfoDb_1.30.1 R6_2.5.1 doParallel_1.0.17
[31] clue_0.3-64 fields_14.1 bitops_1.0-7
[34] cachem_1.0.6 promises_1.2.0.1 pinfsc50_1.2.0
[37] gtable_0.3.1 formula.tools_1.7.1 spam_2.9-1
[40] rlang_1.0.6 calibrate_1.7.7 GlobalOptions_0.1.2
[43] splines_4.1.1 rstatix_0.7.2 rgdal_1.6-4
[46] broom_1.0.3 yaml_2.3.7 reshape2_1.4.4
[49] abind_1.4-5 backports_1.4.1 httpuv_1.6.9
[52] tools_4.1.1 ellipsis_0.3.2 jquerylib_0.1.4
[55] RColorBrewer_1.1-3 proxy_0.4-27 BiocGenerics_0.40.0
[58] Rcpp_1.0.10 plyr_1.8.8 zlibbioc_1.40.0
[61] RCurl_1.98-1.10 classInt_0.4-8 purrr_1.0.1
[64] GetoptLong_1.0.5 viridis_0.6.2 S4Vectors_0.32.4
[67] ggmap_3.0.1 cluster_2.1.4 magrittr_2.0.3
[70] RSpectra_0.16-1 genetics_1.3.8.1.3 circlize_0.4.15
[73] mvtnorm_1.1-3 matrixStats_0.63.0 mime_0.12
[76] evaluate_0.20 xtable_1.8-4 jpeg_0.1-10
[79] IRanges_2.28.0 gridExtra_2.3 compiler_4.1.1
[82] mice_3.15.0 tibble_3.1.8 KernSmooth_2.23-20
[85] V8_4.2.2 crayon_1.5.2 gdistance_1.6
[88] R.oo_1.25.0 htmltools_0.5.4 mgcv_1.8-41
[91] later_1.3.0 tidyr_1.3.0 visdat_0.6.0
[94] DBI_1.1.3 PopGenReport_3.0.7 boot_1.3-28.1
[97] Matrix_1.5-1 cli_3.6.0 R.methodsS3_1.8.2
[100] gdata_2.18.0.1 parallel_4.1.1 dotCall64_1.0-2
[103] igraph_1.4.0 GenomicRanges_1.46.1 pkgconfig_2.0.3
[106] foreign_0.8-84 terra_1.7-3 foreach_1.5.2
[109] memuse_4.2-3 bslib_0.4.2 XVector_0.34.0
[112] digest_0.6.31 polysat_1.7-7 Biostrings_2.62.0
[115] operator.tools_1.6.3 curl_5.0.0 gap_1.5-1
[118] shiny_1.7.4 gtools_3.9.4 rjson_0.2.21
[121] lifecycle_1.0.3 nlme_3.1-162 dismo_1.3-9
[124] jsonlite_1.8.4 seqinr_4.2-23 viridisLite_0.4.1
[127] fansi_1.0.4 pillar_1.8.1 GGally_2.1.2
[130] fastmap_1.1.0 httr_1.4.4 glue_1.6.2
[133] mmod_1.3.3 png_0.1-8 iterators_1.0.14
[136] class_7.3-21 stringi_1.7.12 sass_0.4.5
[139] caTools_1.18.2 e1071_1.7-13
Quality control and SNP isolation
## Setting a seed
set.seed(94)
## load meta-data file
samps <- fread("samps_10Nov2020.csv")
## open GDS for common SNPs (PoolSNP)
genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)
## common SNP.dt
snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
pos = seqGetData(genofile, "position"),
nAlleles = seqGetData(genofile, "$num_allele"),
id = seqGetData(genofile, "variant.id"),
genotype = seqGetData(genofile, "allele"))
snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)
## # of selected variants: 4,281,164
## filter to target
snp.tmp <- data.table(chr ="2L", pos = 3622086:3656954)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)
## # of selected variants: 1,613
## get annotations
message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data
snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))
## Extract data between the 2nd and third | symbol
snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]
## Collapse additional annotations to original SNP vector length
snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]
snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]
## get frequencies
message("Allele Freqs")
ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")
af <- data.table(ad = expand.grid(ad$data)[,1],
dp = expand.grid(dp)[,1],
sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))
## merge them together
message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")
afi[ , af := ad/dp]
## calculate effective read-depth
afis <- merge(afi, samps, by = "sampleId")
afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]
## subsetting dataset
names(afis)
## [1] "sampleId" "variant.id" "ad" "dp"
## [5] "n" "col" "gene" "chr"
## [9] "pos" "nAlleles" "genotype" "af"
## [13] "locality" "country" "city" "collectionDate"
## [17] "DateExact" "lat" "long" "season"
## [21] "type" "continent" "set" "nFlies"
## [25] "SRA_accession" "SRA_experiment" "Model" "collector"
## [29] "sampleType" "year" "yday" "stationId"
## [33] "dist_km" "In(2L)t" "In(2R)Ns" "In(3L)P"
## [37] "In(3R)C" "In(3R)K" "In(3R)Mo" "In(3R)Payne"
## [41] "status" "bio1" "bio2" "bio3"
## [45] "bio4" "bio5" "bio6" "bio7"
## [49] "bio8" "bio9" "bio10" "bio11"
## [53] "bio12" "bio13" "bio14" "bio15"
## [57] "bio16" "bio17" "bio18" "bio19"
## [61] "tmin1" "tmin2" "tmin3" "tmin4"
## [65] "tmin5" "tmin6" "tmin7" "tmin8"
## [69] "tmin9" "tmin10" "tmin11" "tmin12"
## [73] "tmax1" "tmax2" "tmax3" "tmax4"
## [77] "tmax5" "tmax6" "tmax7" "tmax8"
## [81] "tmax9" "tmax10" "tmax11" "tmax12"
## [85] "prec1" "prec2" "prec3" "prec4"
## [89] "prec5" "prec6" "prec7" "prec8"
## [93] "prec9" "prec10" "prec11" "prec12"
## [97] "propSimNorm" "sex" "nEff" "af_nEff"
season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)
## 'data.frame': 438736 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ sampleId : chr "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
## $ locality : chr "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
## $ long : num 15.6 15.6 15.6 15.6 15.6 ...
## $ lat : num 48.4 48.4 48.4 48.4 48.4 ...
## $ season : chr "spring" "spring" "spring" "spring" ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 1 0 0.1569 0.0196 NA ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 28 26 31 31 NA 36 36 33 24 30 ...
head(season.dat)
## type year sampleId locality country city long lat season
## 1 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## variant.id genotype af col continent nEff
## 1 162333 A,C 1.00000000 3_prime_UTR_variant Europe 28
## 2 162334 A,C 0.00000000 3_prime_UTR_variant Europe 26
## 3 162335 C,T 0.15686275 3_prime_UTR_variant Europe 31
## 4 162336 T,C 0.01960784 3_prime_UTR_variant Europe 31
## 5 162337 C,T NA 3_prime_UTR_variant Europe NA
## 6 162338 C,T 0.00000000 3_prime_UTR_variant Europe 36
dim(season.dat)
## [1] 438736 15
season.dat$season[season.dat$season == "frost"] <- "fall"
season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()
localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")
## getting samples collected at least once during spring and winter
for(loc in localities){
dft <- season.dat[season.dat$locality == loc, ]
if(all(seasons %in% unique(dft$season))){
season.filter.dat <- rbind(season.filter.dat, dft)
}
}
dim(season.filter.dat)
## [1] 285501 15
dim(na.omit(season.filter.dat))
## [1] 269454 15
str(season.filter.dat)
## 'data.frame': 285501 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ locality : chr "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ long : num 16.4 16.4 16.4 16.4 16.4 ...
## $ lat : num 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)
## [1] "sampleId" "city" "country" "season" "variant.id"
## [6] "continent" "af" "nEff"
str(mat1)
## 'data.frame': 285501 obs. of 8 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
dim(mat1)
## [1] 285501 8
mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)
## [1] 269454 8
mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)
## [1] 212017 8
mat1 <- mat1[ , -8]
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
## Changing name of cities
mat1$city <- as.factor(mat1$city)
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet \xe0 Gobet" "Charlottesville"
## [9] "Charlotttesville" "Chornobyl (Cooling pond)"
## [11] "Chornobyl Applegarden" "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi" "Chornobyl Polisske"
## [15] "Chornobyl Yaniv" "Cross Plains"
## [17] "Drogobych" "Esparto"
## [19] "Gimenells" "Gotheron"
## [21] "Gross-Enzersdorf" "Homestead"
## [23] "Ithaca" "Kalna"
## [25] "Karensminde" "Karensminde Orchard"
## [27] "Kharkiv" "Kyiv"
## [29] "Kyiv (Vyshneve)" "Lancaster"
## [31] "Linvilla" "Market Harborough"
## [33] "Mauternbach" "Munich"
## [35] "Odesa" "Odessa"
## [37] "Sheffield" "Slankamen. Vinogradi"
## [39] "State College" "Sudbury"
## [41] "Topeka" "Tuolumne"
## [43] "valday" "Valday"
## [45] "Viltain" "Yesiloz"
## [47] "Yesiloz TR13" "Yesiloz TR14"
## [49] "Yesiloz TR15" "Yesiloz TR16"
## [51] "Yesiloz TR17" "Yesiloz TR18"
levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl (Cooling pond)" "Chornobyl Applegarden"
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"
## [13] "Chornobyl Polisske" "Chornobyl Yaniv"
## [15] "Cross Plains" "Drogobych"
## [17] "Esparto" "Gimenells"
## [19] "Gotheron" "Gross-Enzersdorf"
## [21] "Homestead" "Ithaca"
## [23] "Kalna" "Karensminde"
## [25] "Kharkiv" "Kyiv"
## [27] "Kyiv (Vyshneve)" "Lancaster"
## [29] "Linvilla" "Market Harborough"
## [31] "Mauternbach" "Munich"
## [33] "Odessa" "Sheffield"
## [35] "Slankamen-Vinogradi" "State College"
## [37] "Sudbury" "Topeka"
## [39] "Tuolumne" "Valday"
## [41] "Viltain" "Yesiloz"
levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Kyiv (Vyshneve)" "Lancaster" "Linvilla"
## [25] "Market Harborough" "Mauternbach" "Munich"
## [28] "Odessa" "Sheffield" "Slankamen-Vinogradi"
## [31] "State College" "Sudbury" "Topeka"
## [34] "Tuolumne" "Valday" "Viltain"
## [37] "Yesiloz"
levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Lancaster" "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich" "Odessa"
## [28] "Sheffield" "Slankamen-Vinogradi" "State College"
## [31] "Sudbury" "Topeka" "Tuolumne"
## [34] "Valday" "Viltain" "Yesiloz"
levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl" "Cross Plains"
## [11] "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron"
## [15] "Gross-Enzersdorf" "Homestead"
## [17] "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv"
## [21] "Kyiv" "Lancaster"
## [23] "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich"
## [27] "Odessa" "Sheffield"
## [29] "Slankamenacki-Vinogradi" "State College"
## [31] "Sudbury" "Topeka"
## [33] "Tuolumne" "Valday"
## [35] "Viltain" "Yesiloz"
dim(mat1)
## [1] 212017 7
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses
mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]
## sampleId city country season continent af.162333
## 9679 AT_gr_12_fall Gross-Enzersdorf Austria fall Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring Europe 0.8888889
## 1 AT_Mau_14_01 Mauternbach Austria spring Europe 1.0000000
## 1617 AT_Mau_14_02 Mauternbach Austria fall Europe NA
## 3227 AT_Mau_15_50 Mauternbach Austria spring Europe 0.7714286
## 4840 AT_Mau_15_51 Mauternbach Austria fall Europe 0.6984127
## af.162335 af.162336 af.162337 af.162338
## 9679 0.2272727 0.00000000 0.02439024 0
## 11292 NA 0.00000000 NA NA
## 1 0.1568627 0.01960784 NA 0
## 1617 NA 0.00000000 0.00000000 0
## 3227 0.1323529 0.00000000 0.00000000 0
## 4840 0.2656250 0.01282051 0.00000000 0
dim(mat2)
## [1] 170 1614
mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)
mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]
usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")
for(i in usa){
mat2$country[mat2$city == i] <- i
}
mat2$country <- as.factor(mat2$country)
levels(mat2$country)
## [1] "Austria" "Benton Harbor" "Charlottesville" "Cross Plains"
## [5] "Denmark" "Esparto" "Finland" "France"
## [9] "Germany" "Ithaca" "Lancaster" "Linvilla"
## [13] "Russia" "Spain" "State College" "Tuolumne"
## [17] "Turkey" "Ukraine"
levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "Ithaca" "Lancaster" "Linvilla" "Russia"
## [13] "Spain" "State College" "Tuolumne" "Turkey"
## [17] "Ukraine"
levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "Linvilla" "Russia" "Spain"
## [13] "State College" "Tuolumne" "Turkey" "Ukraine"
levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 C/C A/A C/T T/C <NA>
## AT_Mau_14_02 A/C A/A C/T T/T C/C
## AT_Mau_15_50 A/C A/A C/T T/T C/C
## AT_Mau_15_51 A/C <NA> C/T T/C C/C
## AT_See_14_44 A/C A/C C/T T/T C/C
for(i in 1:ncol(mat3)){
mat3[ , i] <- gsub("/", "", mat3[, i])
}
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
## AT_Mau_15_51 AC <NA> CT TC CC
## AT_See_14_44 AC AC CT TT CC
mat3$ind <- rownames(mat3)
dim(mat3)
## [1] 272 1614
mat3[1:5, 1610:1614]
## 163994 163995 163996 163997 ind
## AT_Mau_14_01 GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 GA GG GC AA AT_Mau_15_51
## AT_See_14_44 GA GA GC AA AT_See_14_44
mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)
## [1] 152 1614
mat4[1:5, 1600:1614]
## 163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01 AA AG TT CG AG CC AC CC AG
## AT_Mau_14_02 AA AG TT CG AG CC AC CC AA
## AT_Mau_15_50 AA AG TC CG AG CT AC CG AG
## AT_Mau_15_51 AA AG TT CG AG CC AC CG AA
## AT_gr_12_fall AA AG TT CG AG CT AC CG AA
## 163993 163994 163995 163996 163997 ind
## AT_Mau_14_01 TC GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 TC GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 TC GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 TC GA GG GC AA AT_Mau_15_51
## AT_gr_12_fall TC GA GG GC AA AT_gr_12_fall
same_ord <- mat2[ , c(1, 2, 3)]
colnames(same_ord) <- c("ind", "city", "country")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)
## [1] 152 1616
mat5[1:5, 1:5]
## ind 162333 162334 162335 162336
## 1 AT_gr_12_fall AC <NA> CT TT
## 2 AT_gr_12_spring AC AA CT TT
## 3 AT_Mau_14_01 CC AA CT TC
## 4 AT_Mau_14_02 AC AA CT TT
## 5 AT_Mau_15_50 AC AA CT TT
rownames(mat5) <- mat5[ , 1]
dim(mat5)
## [1] 152 1616
mat5[1:5, 1610:1616]
## 163993 163994 163995 163996 163997 city country
## AT_gr_12_fall TC GA GG GC AA Gross-Enzersdorf Austria
## AT_gr_12_spring TC GA GG GC AA Gross-Enzersdorf Austria
## AT_Mau_14_01 TC GA GG GC AA Mauternbach Austria
## AT_Mau_14_02 TC GA GG GC AT Mauternbach Austria
## AT_Mau_15_50 TC GA GG GC AA Mauternbach Austria
mat6 <- mat5[ , -c(1, 1615, 1616)]
dim(mat6)
## [1] 152 1613
mat6[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_gr_12_fall AC <NA> CT TT CT
## AT_gr_12_spring AC AA CT TT CC
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
mat6[1:5, 1609:1613]
## 163993 163994 163995 163996 163997
## AT_gr_12_fall TC GA GG GC AA
## AT_gr_12_spring TC GA GG GC AA
## AT_Mau_14_01 TC GA GG GC AA
## AT_Mau_14_02 TC GA GG GC AT
## AT_Mau_15_50 TC GA GG GC AA
fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen
## /// GENIND OBJECT /////////
##
## // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 152 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country,
## ploidy = 2)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen$tab[1:5, 1:5]
## 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall 1 1 NA NA 1
## AT_gr_12_spring 1 1 2 0 1
## AT_Mau_14_01 0 2 2 0 1
## AT_Mau_14_02 1 1 2 0 1
## AT_Mau_15_50 1 1 2 0 1
summary(fly_gen$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 18
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]
## PA_li_10_spring
## 0.7123373
barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## quality control begins here
mlg(fly_gen) ## keep only polymorphic loci
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen) %>% summary
## Mode FALSE TRUE
## logical 1 1612
poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary
## Mode TRUE
## logical 1612
fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]
n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3224 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3224 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
indmiss_fly2 <- propTyped(fly_gen2, by = "ind")
barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes
## 162334 162396 162412 162545 162546 162547 162549 162551
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795
## 162553 162554 162555 162557 162560 162563 162587 162623
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013
## 162746 162755 162756 162882 162892 162986 162987 162988
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768
## 162989 162990 162992 163029 163072 163073 163074 163075
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437
## 163076 163118 163119 163120 163121 163122 163123 163124
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940
## 163125 163307 163363 163365 163366 163608 163648 163650
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815
## 163692 163715 163716 163717 163765 163821 163908 163910
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139
## 163914 163915 163916 163917 163922 163958
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868
barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]
## named numeric(0)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
summary(fly_gen3$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
mlg(fly_gen3)
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
## computing basic stats
basic_fly <- basic.stats(fly_gen3, diploid = TRUE)
## testing for H-W equilibrium
## Chi-squared test: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 0)[ , 3]))
HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI 0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark 0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto 0.23013934 0.04550026 1.0000000 0.7750970 0.8037847
## Monte Carlo: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))
HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.404 0.087 1 1 1
## MI.WI 0.437 0.174 1 1 1
## Charlottesville 1.000 0.147 1 1 1
## Denmark 1.000 0.111 1 1 1
## Esparto 1.000 0.320 1 1 1
alpha <- 0.05
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))
Prop.loci.out.of.HWE[1:10, ]
## Chisq MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000
## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests
Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"),
nrow = nrow(HWE.test.chisq))
MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"),
nrow = nrow(HWE.test.MC))
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean),
MC = apply(HWE.test.MC<alpha, 2, mean),
Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
MC.fdr = apply(MC.fdr<alpha, 2, mean))
head(Prop.loci.out.of.HWE)
## Chisq MC Chisq.fdr MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.1333333
## 162335 1.0000000 0.4000000 0.6000000 0.2000000
## 162336 0.0000000 0.0000000 0.0000000 0.0000000
## 162337 0.0000000 0.0000000 0.0000000 0.0000000
## 162338 0.0000000 0.0000000 0.0000000 0.0000000
## 162339 0.0000000 0.0000000 0.0000000 0.0000000
loci <- data.frame()
idx <- 1
for(i in 1:nrow(Prop.loci.out.of.HWE)){
fdr <- Prop.loci.out.of.HWE[i , ]
if(fdr$MC.fdr > 0.5){
loci <- rbind(fdr, loci)
}
}
loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.
## data frame with 0 columns and 0 rows
dim(loci)
## [1] 0 0
## check for linkage disequilibrium (LD)
LD <- as.genclone(fly_gen3)
LD
##
## This is a genclone object
## -------------------------
## Genotype information:
##
## 151 original multilocus genotypes
## 151 diploid individuals
## 1550 codominant loci
##
## Population information:
##
## 0 strata.
## 15 populations defined -
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine
quartz()
LD.general <- poppr::ia(LD, sample = 500)
LD.general
## Ia p.Ia rbarD p.rD
## 15.592940300 0.001996008 0.013771123 0.001996008
Computing pairwise Fst test
fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)
fly_fst[[1:10, 1:10]]
## Austria MI-WI Charlottesville Denmark Esparto Finland France
## MI-WI 0.053
## Charlottesville 0.058 0.005
## Denmark 0.018 0.049 0.054
## Esparto 0.056 0.012 0.017 0.049
## Finland 0.015 0.049 0.057 0.024 0.054
## France 0.021 0.042 0.045 0.018 0.041 0.022
## Germany 0.012 0.041 0.046 0.016 0.043 0.013 0.010
## NY-MA 0.044 0.006 0.003 0.040 0.009 0.044 0.033
## PA 0.053 0.004 0.004 0.050 0.013 0.051 0.044
## Germany NY-MA
## MI-WI
## Charlottesville
## Denmark
## Esparto
## Finland
## France
## Germany
## NY-MA 0.034
## PA 0.041 0.002
fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]
## Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria 0.000 0.053 0.058 0.018 0.056 0.015 0.021
## MI-WI 0.053 0.000 0.005 0.049 0.012 0.049 0.042
## Charlottesville 0.058 0.005 0.000 0.054 0.017 0.057 0.045
## Denmark 0.018 0.049 0.054 0.000 0.049 0.024 0.018
## Esparto 0.056 0.012 0.017 0.049 0.000 0.054 0.041
## Finland 0.015 0.049 0.057 0.024 0.054 0.000 0.022
## France 0.021 0.042 0.045 0.018 0.041 0.022 0.000
## Germany 0.012 0.041 0.046 0.016 0.043 0.013 0.010
## NY-MA 0.044 0.006 0.003 0.040 0.009 0.044 0.033
## PA 0.053 0.004 0.004 0.050 0.013 0.051 0.044
## Germany NY-MA PA
## Austria 0.012 0.044 0.053
## MI-WI 0.041 0.006 0.004
## Charlottesville 0.046 0.003 0.004
## Denmark 0.016 0.040 0.050
## Esparto 0.043 0.009 0.013
## Finland 0.013 0.044 0.051
## France 0.010 0.033 0.044
## Germany 0.000 0.034 0.041
## NY-MA 0.034 0.000 0.002
## PA 0.041 0.002 0.000
tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object
## Starting gi2gl
## Starting gl.compliance.check
## Processing genlight object with SNP data
## Checking coding of SNPs
## SNP data scored NA, 0, 1 or 2 confirmed
## Checking locus metrics and flags
## Recalculating locus metrics
## Checking for monomorphic loci
## No monomorphic loci detected
## Checking for loci with all missing data
## No loci with all missing data detected
## Checking whether individual names are unique.
## Checking for individual metrics
## Warning: Creating a slot for individual metrics
## Checking for population assignments
## Population assignments confirmed
## Spelling of coordinates checked and changed if necessary to
## lat/lon
## Completed: gl.compliance.check
## Completed: gi2gl
pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)
## Starting gl.fst.pop
## Processing genlight object with SNP data
## Completed: gl.fst.pop
fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst
## Austria Esparto Tuolumne Germany Denmark
## Austria 0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto 0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne 0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany 0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark 0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain 0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland 0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France 0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA 0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI 0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA 0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia 0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey 0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine 0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
## Spain Finland France NY-MA MI-WI
## Austria 0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto 0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne 0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany 0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark 0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain 0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland 0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France 0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA 0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI 0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA 0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia 0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey 0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine 0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
## PA Russia Turkey Ukraine Charlottesville
## Austria 0.053102204 0.016899657 0.01412191 0.014163086 0.058198201
## Esparto 0.013471651 0.053516381 0.06054132 0.063774691 0.017272103
## Tuolumne 0.009573028 0.060654501 0.06292115 0.069481577 0.015222898
## Germany 0.041313299 0.020656074 0.01890040 0.020327235 0.046035247
## Denmark 0.050159598 0.023328359 0.02179383 0.023697489 0.054027742
## Spain 0.038958914 0.031639501 0.02680916 0.031831900 0.046471038
## Finland 0.051458012 0.019135166 0.01677821 0.022452499 0.057285510
## France 0.043675456 0.021999335 0.02823027 0.028812985 0.044509519
## NY-MA 0.001936861 0.047419819 0.04935836 0.055798661 0.002951612
## MI-WI 0.004351338 0.052957069 0.05430445 0.064183546 0.004506388
## PA 0.000000000 0.058255707 0.05610334 0.066041832 0.004019586
## Russia 0.058255707 0.000000000 0.01717863 0.007515034 0.059645911
## Turkey 0.056103340 0.017178630 0.00000000 0.012939846 0.058036081
## Ukraine 0.066041832 0.007515034 0.01293985 0.000000000 0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473 0.000000000
my_palette <- colorRampPalette(c("green", "yellow", "red"))(n = 100)
heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette)
Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.
OutFlANK outlier analysis
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
snps <- as.data.frame(nAll(fly_gen3))
indi <- as.data.frame(fly_gen3$tab)
rownames(snps)[1:10]
## [1] "162333" "162335" "162336" "162337" "162338" "162339" "162340" "162341"
## [9] "162342" "162343"
rownames(indi)[1:10]
## [1] "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02"
## [5] "AT_Mau_15_50" "AT_Mau_15_51" "CA_es_12_fall" "CA_es_12_frost"
## [9] "CA_es_12_spring" "CA_es_13_fall"
seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
## # of selected samples: 151
## # of selected variants: 1,550
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)
outest <- read.vcfR("my_vcf.gz")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 44
## header_line: 45
## variant count: 1550
## column count: 160
##
Meta line 44 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 1550
## Character matrix gt cols: 160
## skip: 0
## nrows: 1550
## row_num: 0
##
Processed variant 1000
Processed variant: 1550
## All variants processed
dataout <- vcfR2genind(outest)
dataout$pop <- as.factor(fly_gen3$pop)
outflnk <- gl.outflank(dataout, qthreshold = 0.05, plot = FALSE)
## Calculating FSTs, may take a few minutes...
outflnk.df <- outflnk$outflank$results
rowsToRemove <- seq(1, nrow(outflnk.df), by = 2)
outflnk.df <- outflnk.df[-rowsToRemove, ]
outflnk.df$OutlierFlag %>% summary
## Mode FALSE NA's
## logical 1237 313
outflnk.df <- na.omit(outflnk.df)
outflnk.df$FST[outflnk.df$FST < 0] <- 0
head(outflnk.df)
## LocusName He FST T1 T2 FSTNoCorr
## 2 2L_3622148.0 0.4791145 0.009766519 2.345072e-03 0.24011342 0.028452713
## 4 2L_3622167.1 0.4998965 0.000000000 -2.026437e-05 0.24997177 0.001549929
## 6 2L_3622402.1 0.1118148 0.032767588 1.844529e-03 0.05629127 0.081124841
## 8 2L_3622520.1 0.1699219 0.086990263 7.484706e-03 0.08604073 0.127168698
## 10 2L_3622544.1 0.1638000 0.028275732 2.329770e-03 0.08239470 0.072847321
## 12 2L_3622548.1 0.2225125 0.076400369 8.594932e-03 0.11249857 0.113994917
## T1NoCorr T2NoCorr meanAlleleFreq indexOrder GoodH qvalues pvalues
## 2 0.0068386640 0.24035192 0.6021898 2 goodH 0.6691213 0.68774049
## 4 0.0003874701 0.24999218 0.5071942 4 goodH 0.7222558 0.04538728
## 6 0.0045779572 0.05643102 0.9405594 6 goodH 0.4579630 0.60148959
## 8 0.0109645441 0.08622046 0.9062500 8 goodH 0.3903686 0.30413760
## 10 0.0060152736 0.08257371 0.9100000 10 goodH 0.4733307 0.67993881
## 12 0.0128484360 0.11271060 0.8724832 12 goodH 0.4080323 0.36966216
## pvaluesRightTail OutlierFlag
## 2 0.6561298 FALSE
## 4 0.9773064 FALSE
## 6 0.3007448 FALSE
## 8 0.1520688 FALSE
## 10 0.3399694 FALSE
## 12 0.1848311 FALSE
## checking assumptions
plot(outflnk.df$FST, outflnk.df$FSTNoCorr, pch = 19, col = alpha("black", 0.3), ylab = "Fst uncorrected", xlab = "Fst corrected", las = 1, main = "Loci deviation from the linear relationship")
abline(0, 1, col = "skyblue", lwd = 2)
## Italic labels
fstlab <- expression(italic("F")[ST])
hetlab <- expression(italic("H")[e])
## Plot He versus Fst
outFlank <- ggplot(data = outflnk.df) +
geom_point(aes(x = He, y = FST, colour = OutlierFlag)) +
scale_colour_manual(values = c("black","red"), labels = c("Neutral SNP","Outlier SNP")) +
xlab(hetlab) +
ylab(fstlab) +
theme(legend.title = element_blank(),
)
outFlank
Figure 3. Manhattan plot of the distribution of Fst loci as a function of their heterozygosity. No Fst loci showed significant deviation from the genetic background.
Genetic admixture proportions
## exploring the data once again (do it all the time!)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
nLoc(fly_gen3) # number of loci
## [1] 1550
nPop(fly_gen3) # number of sites
## [1] 15
nInd(fly_gen3) # number of individuals
## [1] 151
summary(fly_gen3$pop) # sample size
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
snmf1 <- load.snmfProject("gl_geno.snmfProject")
## extract Q-matrix for the best run
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)
## extract the cross-entropy of all runs where K = 9
ce <- cross.entropy(snmf1, K = 9)
ce
## K = 9
## run 1 0.2868380
## run 2 0.2973042
## run 3 0.2951137
## run 4 0.2993486
## run 5 0.2950550
## run 6 0.2978129
## run 7 0.2963995
## run 8 0.3005061
## run 9 0.2952275
## run 10 0.2980091
## run 11 0.2930379
## run 12 0.2862978
## run 13 0.2982294
## run 14 0.2954692
## run 15 0.3006121
## run 16 0.3052267
## run 17 0.2922759
## run 18 0.2974110
## run 19 0.3013653
## run 20 0.3033054
## run 21 0.2973955
## run 22 0.3023131
## run 23 0.3005838
## run 24 0.2985870
## run 25 0.3010128
## run 26 0.2873666
## run 27 0.2937488
## run 28 0.2929741
## run 29 0.3025899
## run 30 0.2974862
## run 31 0.2986869
## run 32 0.3019270
## run 33 0.2917947
## run 34 0.2929909
## run 35 0.2986107
## run 36 0.2974049
## run 37 0.3030945
## run 38 0.2906817
## run 39 0.3023015
## run 40 0.3027009
## run 41 0.3007979
## run 42 0.3054084
## run 43 0.3001154
## run 44 0.2986051
## run 45 0.3028176
## run 46 0.2986939
## run 47 0.3030751
## run 48 0.3048518
## run 49 0.3040839
## run 50 0.3093110
## find the run with the lowest cross-entropy
lowest.ce <- which.min(ce)
lowest.ce
## [1] 12
## extract Q-matrix for the best run
qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)
## V1 V2 V3 V4 V5 V6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
## V7 V8 V9
## 1 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.000099973 0.262751000
## 3 2.46401e-02 0.188868000 0.000099973
## 4 5.04343e-02 0.648635000 0.101599000
## 5 9.99550e-05 0.230907000 0.000099955
## 6 8.43814e-02 0.449977000 0.036154800
## changing order of levels
pops <- fly_gen3$pop
levels(pops)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
levels(pops)[3] <- "Charlott."
qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]
pops <- qmplot$pops
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
2, 2, 2, 2,
2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))
par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.345, line = 2.8, font = 2, family = "serif", las = 1)
par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n", ylab = "Admixture proportion", las = 1, cex.lab = 1.4)
## adding population labels to the axis:
names <- unique(as.character(pops))
medians <- c()
for(i in 1:length(pops)){
axis(1, at = median(which(pops == pops[i])), labels = "")
medians <- c(medians, median(which(pops == pops[i])))
}
medians[1:10]
## [1] 3 3 3 3 3 8 8 8 8 8
text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Individuals", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)
Figure 4. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k=1 and k=10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Colors indicate genetic clusters.
## label column names of qmatrix
ncol(qmatrix)
## [1] 9
cluster_names = c()
for(i in 1:ncol(qmatrix)){
cluster_names[i] = paste("Cluster", i)
}
cluster_names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"
colnames(qmatrix) <- cluster_names
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
## Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.000099973 0.262751000
## 3 2.46401e-02 0.188868000 0.000099973
## 4 5.04343e-02 0.648635000 0.101599000
## 5 9.99550e-05 0.230907000 0.000099955
## 6 8.43814e-02 0.449977000 0.036154800
dim(qmatrix)
## [1] 151 9
## add individual IDs
qmatrix$Ind <- indNames(fly_gen3)
## add site IDs
qmatrix$Site <- fly_gen3$pop
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 0.048722400 0.309613000 9.99639e-05 9.99639e-05 0.5840360 9.99639e-05
## 2 0.261074000 0.049324700 8.74701e-03 9.99730e-05 0.2930510 1.24752e-01
## 3 0.126625000 0.290528000 9.99730e-05 9.99730e-05 0.3630220 6.01603e-03
## 4 0.083122600 0.000099973 1.66848e-02 9.99730e-05 0.0992241 9.99730e-05
## 5 0.000099955 0.529978000 9.99550e-05 9.55478e-03 0.2290610 9.99550e-05
## 6 0.000099964 0.096673000 9.99640e-05 9.99640e-05 0.3324140 9.99640e-05
## Cluster 7 Cluster 8 Cluster 9 Ind Site
## 1 9.99639e-05 0.040000000 0.017229500 AT_gr_12_fall Austria
## 2 9.99730e-05 0.000099973 0.262751000 AT_gr_12_spring Austria
## 3 2.46401e-02 0.188868000 0.000099973 AT_Mau_14_01 Austria
## 4 5.04343e-02 0.648635000 0.101599000 AT_Mau_14_02 Austria
## 5 9.99550e-05 0.230907000 0.000099955 AT_Mau_15_50 Austria
## 6 8.43814e-02 0.449977000 0.036154800 AT_Mau_15_51 Austria
## calculate mean admixture proportions for each site
clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)
## country Ind Cluster 1 Cluster 2 Cluster 3 Cluster 4
## 1 Austria AT_gr_12_fall 0.048722400 0.309613000 9.99639e-05 9.99639e-05
## 2 Austria AT_gr_12_spring 0.261074000 0.049324700 8.74701e-03 9.99730e-05
## 3 Austria AT_Mau_14_01 0.126625000 0.290528000 9.99730e-05 9.99730e-05
## 4 Austria AT_Mau_14_02 0.083122600 0.000099973 1.66848e-02 9.99730e-05
## 5 Austria AT_Mau_15_50 0.000099955 0.529978000 9.99550e-05 9.55478e-03
## 6 Austria AT_Mau_15_51 0.000099964 0.096673000 9.99640e-05 9.99640e-05
## Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 0.5840360 9.99639e-05 9.99639e-05 0.040000000 0.017229500
## 2 0.2930510 1.24752e-01 9.99730e-05 0.000099973 0.262751000
## 3 0.3630220 6.01603e-03 2.46401e-02 0.188868000 0.000099973
## 4 0.0992241 9.99730e-05 5.04343e-02 0.648635000 0.101599000
## 5 0.2290610 9.99550e-05 9.99550e-05 0.230907000 0.000099955
## 6 0.3324140 9.99640e-05 8.43814e-02 0.449977000 0.036154800
str(avg_admix)
## 'data.frame': 151 obs. of 11 variables:
## $ country : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0487 0.2611 0.1266 0.0831 0.0001 ...
## $ Cluster 2: num 0.3096 0.0493 0.2905 0.0001 0.53 ...
## $ Cluster 3: num 0.0001 0.00875 0.0001 0.01668 0.0001 ...
## $ Cluster 4: num 0.0001 0.0001 0.0001 0.0001 0.00955 ...
## $ Cluster 5: num 0.584 0.2931 0.363 0.0992 0.2291 ...
## $ Cluster 6: num 0.0001 0.12475 0.00602 0.0001 0.0001 ...
## $ Cluster 7: num 0.0001 0.0001 0.0246 0.0504 0.0001 ...
## $ Cluster 8: num 0.04 0.0001 0.1889 0.6486 0.2309 ...
## $ Cluster 9: num 0.0172 0.2628 0.0001 0.1016 0.0001 ...
## import csv file containing coordinates
coor <- read.csv("coor.csv")
str(coor)
## 'data.frame': 16 obs. of 3 variables:
## $ country: chr "Austria" "Esparto" "Tuolumne" "Germany" ...
## $ long : num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ lat : num 48.3 38.7 38 48.2 55.9 ...
head(coor)
## country long lat
## 1 Austria 15.965000 48.28750
## 2 Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4 Germany 9.714757 48.19901
## 5 Denmark 10.212586 55.94513
## 6 Spain 1.279236 41.51821
admix <- merge(coor, avg_admix, by = "country")
str(admix)
## 'data.frame': 151 obs. of 13 variables:
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ long : num 16 16 16 16 16 ...
## $ lat : num 48.3 48.3 48.3 48.3 48.3 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0487 0.2611 0.1266 0.0831 0.0001 ...
## $ Cluster 2: num 0.3096 0.0493 0.2905 0.0001 0.53 ...
## $ Cluster 3: num 0.0001 0.00875 0.0001 0.01668 0.0001 ...
## $ Cluster 4: num 0.0001 0.0001 0.0001 0.0001 0.00955 ...
## $ Cluster 5: num 0.584 0.2931 0.363 0.0992 0.2291 ...
## $ Cluster 6: num 0.0001 0.12475 0.00602 0.0001 0.0001 ...
## $ Cluster 7: num 0.0001 0.0001 0.0246 0.0504 0.0001 ...
## $ Cluster 8: num 0.04 0.0001 0.1889 0.6486 0.2309 ...
## $ Cluster 9: num 0.0172 0.2628 0.0001 0.1016 0.0001 ...
head(admix)
## country long lat Ind Cluster 1 Cluster 2 Cluster 3
## 1 Austria 15.965 48.2875 AT_gr_12_fall 0.048722400 0.309613000 9.99639e-05
## 2 Austria 15.965 48.2875 AT_gr_12_spring 0.261074000 0.049324700 8.74701e-03
## 3 Austria 15.965 48.2875 AT_Mau_14_01 0.126625000 0.290528000 9.99730e-05
## 4 Austria 15.965 48.2875 AT_Mau_14_02 0.083122600 0.000099973 1.66848e-02
## 5 Austria 15.965 48.2875 AT_Mau_15_50 0.000099955 0.529978000 9.99550e-05
## 6 Austria 15.965 48.2875 AT_Mau_15_51 0.000099964 0.096673000 9.99640e-05
## Cluster 4 Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.5840360 9.99639e-05 9.99639e-05 0.040000000 0.017229500
## 2 9.99730e-05 0.2930510 1.24752e-01 9.99730e-05 0.000099973 0.262751000
## 3 9.99730e-05 0.3630220 6.01603e-03 2.46401e-02 0.188868000 0.000099973
## 4 9.99730e-05 0.0992241 9.99730e-05 5.04343e-02 0.648635000 0.101599000
## 5 9.55478e-03 0.2290610 9.99550e-05 9.99550e-05 0.230907000 0.000099955
## 6 9.99640e-05 0.3324140 9.99640e-05 8.43814e-02 0.449977000 0.036154800
colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")
Plotting admixture proportion on a map
## Plotting map
quartz()
map("world", fill = TRUE, col = "gray", xlim = c(-130, 50), ylim = c(-20, 70), border = 0, mar = c(0, 5.5, 0, 0))
map.axes(cex.axis = 0.7, las = 1)
mtext(side = 1, line = 2, "Longitude")
mtext(side = 2, line = 2, "Latitude")
## Adding pie charts to the map
for(i in 1:nrow(admix)){
add.pie(z = c(admix$cluster1[i], admix$cluster2[i], admix$cluster3[i], admix$cluster4[i], admix$cluster5[i], admix$cluster6[i], admix$cluster7[i], admix$cluster8[i], admix$cluster9[i]), x = admix$long[i], y = admix$lat[i], radius = 2, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = TRUE)
}
Figure 5. Mean admixture proportions across populations of D. melanogaster surveyed in America and Europe. Colors in the pie charts represent genetic clusters.
Isolation by Environment and Isolation by Distance analyses
unique(fly_gen3$pop)
## [1] Austria Esparto Tuolumne Germany
## [5] Denmark Spain Finland France
## [9] NY-MA MI-WI PA Russia
## [13] Turkey Ukraine Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine
obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)
idx <- 1
for(i in unique(fly_gen3$pop)){
long <- admix$long[admix$country == i][1]
lat <- admix$lat[admix$country == i][1]
country <- i
obj[idx, ] <- c(country, long, lat)
idx = idx + 1
}
obj
## [,1] [,2] [,3]
## [1,] "Austria" "15.965" "48.2875"
## [2,] "Esparto" "-122.046" "38.68"
## [3,] "Tuolumne" "-120.26" "37.97"
## [4,] "Germany" "9.7147565" "48.199012"
## [5,] "Denmark" "10.212586" "55.945128"
## [6,] "Spain" "1.279236" "41.518208"
## [7,] "Finland" "23.52" "61.1"
## [8,] "France" "3.5442054" "46.8656662"
## [9,] "NY-MA" "-74.085" "42.445"
## [10,] "MI-WI" "-88.04915" "42.61055"
## [11,] "PA" "-76.6350005" "40.3366975"
## [12,] "Russia" "33.235277" "58.02444"
## [13,] "Turkey" "32.260328" "40.231444"
## [14,] "Ukraine" "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47" "38"
obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)
## 'data.frame': 15 obs. of 2 variables:
## $ x: num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ y: num 48.3 38.7 38 48.2 55.9 ...
fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## @other: a list containing: x y
## genetic distance
GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))
## geographic distance
Dgeo <- round(dist(fly_gen3$other), 3)
## making sure the order of rows and columns are the same in all of the matrices
ord.gen <- as.matrix(GD.pop.PairwiseFst.hierfstat)
ord.geo <- as.matrix(Dgeo)
ord.geo
## Austria Esparto Tuolumne Germany Denmark Spain Finland
## Austria 0.000 138.345 136.615 6.251 9.578 16.171 14.874
## Esparto 138.345 0.000 1.922 132.104 133.381 123.358 147.282
## Tuolumne 136.615 1.922 0.000 130.377 131.705 121.591 145.629
## Germany 6.251 132.104 130.377 0.000 7.762 10.761 18.895
## Denmark 9.578 133.381 131.705 7.762 0.000 16.969 14.271
## Spain 16.171 123.358 121.591 10.761 16.969 0.000 29.633
## Finland 14.874 147.282 145.629 18.895 14.271 29.633 0.000
## France 12.502 125.857 124.123 6.313 11.265 5.807 24.529
## NY-MA 90.239 48.109 46.391 83.997 85.372 75.370 99.372
## MI-WI 104.169 34.223 32.543 97.924 99.162 89.335 113.091
## PA 92.941 45.441 43.689 86.707 88.239 77.923 102.285
## Russia 19.826 156.482 154.800 25.490 23.116 35.967 10.190
## Turkey 18.178 154.314 152.537 23.912 27.074 31.008 22.625
## Ukraine 14.250 152.594 150.865 20.491 20.977 29.962 13.392
## Charlottesville 94.994 43.581 41.790 88.773 90.480 79.827 104.573
## France NY-MA MI-WI PA Russia Turkey Ukraine
## Austria 12.502 90.239 104.169 92.941 19.826 18.178 14.250
## Esparto 125.857 48.109 34.223 45.441 156.482 154.314 152.594
## Tuolumne 124.123 46.391 32.543 43.689 154.800 152.537 150.865
## Germany 6.313 83.997 97.924 86.707 25.490 23.912 20.491
## Denmark 11.265 85.372 99.162 88.239 23.116 27.074 20.977
## Spain 5.807 75.370 89.335 77.923 35.967 31.008 29.962
## Finland 24.529 99.372 113.091 102.285 10.190 22.625 13.392
## France 0.000 77.755 91.692 80.445 31.719 29.473 26.749
## NY-MA 77.755 0.000 13.965 3.309 108.445 106.368 104.488
## MI-WI 91.692 13.965 0.000 11.638 122.260 120.333 118.414
## PA 80.445 3.309 11.638 0.000 111.285 108.895 107.191
## Russia 31.719 108.445 122.260 111.285 0.000 17.820 9.085
## Turkey 29.473 106.368 120.333 108.895 17.820 0.000 9.477
## Ukraine 26.749 104.488 118.414 107.191 9.085 9.477 0.000
## Charlottesville 82.492 6.244 10.631 2.971 113.486 110.753 109.240
## Charlottesville
## Austria 94.994
## Esparto 43.581
## Tuolumne 41.790
## Germany 88.773
## Denmark 90.480
## Spain 79.827
## Finland 104.573
## France 82.492
## NY-MA 6.244
## MI-WI 10.631
## PA 2.971
## Russia 113.486
## Turkey 110.753
## Ukraine 109.240
## Charlottesville 0.000
ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Austria 0.0000 0.0557 0.0588 0.0128 0.0181 0.0165 0.0151 0.0212
## Esparto 0.0557 0.0000 -0.0013 0.0441 0.0490 0.0446 0.0547 0.0414
## Tuolumne 0.0588 -0.0013 0.0000 0.0468 0.0533 0.0479 0.0601 0.0457
## Germany 0.0128 0.0441 0.0468 0.0000 0.0170 0.0077 0.0136 0.0100
## Denmark 0.0181 0.0490 0.0533 0.0170 0.0000 0.0256 0.0246 0.0186
## Spain 0.0165 0.0446 0.0479 0.0077 0.0256 0.0000 0.0200 0.0158
## Finland 0.0151 0.0547 0.0601 0.0136 0.0246 0.0200 0.0000 0.0222
## France 0.0212 0.0414 0.0457 0.0100 0.0186 0.0158 0.0222 0.0000
## NY-MA 0.0439 0.0089 0.0082 0.0335 0.0403 0.0295 0.0435 0.0334
## MI-WI 0.0540 0.0130 0.0094 0.0412 0.0501 0.0396 0.0492 0.0422
## PA 0.0541 0.0141 0.0099 0.0414 0.0519 0.0393 0.0521 0.0440
## Russia 0.0169 0.0535 0.0605 0.0210 0.0234 0.0318 0.0192 0.0222
## Turkey 0.0139 0.0600 0.0618 0.0188 0.0217 0.0264 0.0163 0.0280
## Ukraine 0.0137 0.0627 0.0674 0.0196 0.0234 0.0308 0.0214 0.0281
## Charlottesville 0.0582 0.0172 0.0153 0.0466 0.0541 0.0466 0.0574 0.0448
## NY-MA MI-WI PA Russia Turkey Ukraine Charlottesville
## Austria 0.0439 0.0540 0.0541 0.0169 0.0139 0.0137 0.0582
## Esparto 0.0089 0.0130 0.0141 0.0535 0.0600 0.0627 0.0172
## Tuolumne 0.0082 0.0094 0.0099 0.0605 0.0618 0.0674 0.0153
## Germany 0.0335 0.0412 0.0414 0.0210 0.0188 0.0196 0.0466
## Denmark 0.0403 0.0501 0.0519 0.0234 0.0217 0.0234 0.0541
## Spain 0.0295 0.0396 0.0393 0.0318 0.0264 0.0308 0.0466
## Finland 0.0435 0.0492 0.0521 0.0192 0.0163 0.0214 0.0574
## France 0.0334 0.0422 0.0440 0.0222 0.0280 0.0281 0.0448
## NY-MA 0.0000 0.0057 0.0017 0.0474 0.0482 0.0535 0.0031
## MI-WI 0.0057 0.0000 0.0042 0.0537 0.0537 0.0616 0.0049
## PA 0.0017 0.0042 0.0000 0.0596 0.0564 0.0646 0.0045
## Russia 0.0474 0.0537 0.0596 0.0000 0.0167 0.0067 0.0597
## Turkey 0.0482 0.0537 0.0564 0.0167 0.0000 0.0129 0.0574
## Ukraine 0.0535 0.0616 0.0646 0.0067 0.0129 0.0000 0.0656
## Charlottesville 0.0031 0.0049 0.0045 0.0597 0.0574 0.0656 0.0000
fst.dist <- round(as.dist(ord.gen), 3)
fst.dist
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto 0.056
## Tuolumne 0.059 -0.001
## Germany 0.013 0.044 0.047
## Denmark 0.018 0.049 0.053 0.017
## Spain 0.016 0.045 0.048 0.008 0.026
## Finland 0.015 0.055 0.060 0.014 0.025 0.020
## France 0.021 0.041 0.046 0.010 0.019 0.016 0.022
## NY-MA 0.044 0.009 0.008 0.034 0.040 0.029 0.044 0.033
## MI-WI 0.054 0.013 0.009 0.041 0.050 0.040 0.049 0.042
## PA 0.054 0.014 0.010 0.041 0.052 0.039 0.052 0.044
## Russia 0.017 0.054 0.060 0.021 0.023 0.032 0.019 0.022
## Turkey 0.014 0.060 0.062 0.019 0.022 0.026 0.016 0.028
## Ukraine 0.014 0.063 0.067 0.020 0.023 0.031 0.021 0.028
## Charlottesville 0.058 0.017 0.015 0.047 0.054 0.047 0.057 0.045
## NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA
## MI-WI 0.006
## PA 0.002 0.004
## Russia 0.047 0.054 0.060
## Turkey 0.048 0.054 0.056 0.017
## Ukraine 0.054 0.062 0.065 0.007 0.013
## Charlottesville 0.003 0.005 0.004 0.060 0.057 0.066
Dgeo
## Austria Esparto Tuolumne Germany Denmark Spain Finland
## Esparto 138.345
## Tuolumne 136.615 1.922
## Germany 6.251 132.104 130.377
## Denmark 9.578 133.381 131.705 7.762
## Spain 16.171 123.358 121.591 10.761 16.969
## Finland 14.874 147.282 145.629 18.895 14.271 29.633
## France 12.502 125.857 124.123 6.313 11.265 5.807 24.529
## NY-MA 90.239 48.109 46.391 83.997 85.372 75.370 99.372
## MI-WI 104.169 34.223 32.543 97.924 99.162 89.335 113.091
## PA 92.941 45.441 43.689 86.707 88.239 77.923 102.285
## Russia 19.826 156.482 154.800 25.490 23.116 35.967 10.190
## Turkey 18.178 154.314 152.537 23.912 27.074 31.008 22.625
## Ukraine 14.250 152.594 150.865 20.491 20.977 29.962 13.392
## Charlottesville 94.994 43.581 41.790 88.773 90.480 79.827 104.573
## France NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA 77.755
## MI-WI 91.692 13.965
## PA 80.445 3.309 11.638
## Russia 31.719 108.445 122.260 111.285
## Turkey 29.473 106.368 120.333 108.895 17.820
## Ukraine 26.749 104.488 118.414 107.191 9.085 9.477
## Charlottesville 82.492 6.244 10.631 2.971 113.486 110.753 109.240
NPP_file <- list.files("/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)
NPP.data.ext.dis <- extract(NPP.data, obj)
head(NPP.data.ext.dis)
## layer
## [1,] 264522317824
## [2,] 391191265280
## [3,] 367969927168
## [4,] 348927459328
## [5,] 25091205120
## [6,] 268916719616
rownames(NPP.data.ext.dis) <- rownames(obj)
DNPP <- round(dist(log(NPP.data.ext.dis)), 3)
DNPP
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto 0.391
## Tuolumne 0.330 0.061
## Germany 0.277 0.114 0.053
## Denmark 2.355 2.747 2.685 2.632
## Spain 0.016 0.375 0.314 0.260 2.372
## Finland 0.610 1.001 0.940 0.886 1.746 0.626
## France 0.577 0.185 0.247 0.300 2.932 0.560 1.186
## NY-MA 0.342 0.050 0.011 0.065 2.697 0.325 0.951 0.235
## MI-WI 0.236 0.155 0.094 0.041 2.592 0.220 0.846 0.340
## PA 0.452 0.061 0.122 0.175 2.808 0.436 1.062 0.124
## Russia 0.549 0.940 0.879 0.826 1.806 0.565 0.061 1.126
## Turkey 0.626 1.017 0.956 0.903 1.729 0.643 0.017 1.203
## Ukraine 0.075 0.466 0.405 0.352 2.280 0.092 0.534 0.652
## Charlottesville 0.526 0.134 0.196 0.249 2.881 0.509 1.135 0.051
## NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA
## MI-WI 0.105
## PA 0.111 0.216
## Russia 0.891 0.785 1.001
## Turkey 0.968 0.862 1.078 0.077
## Ukraine 0.417 0.311 0.527 0.474 0.551
## Charlottesville 0.184 0.290 0.074 1.075 1.152 0.601
## partial mantel test
part.mant <- mantel.partial(fst.dist, DNPP, Dgeo, method = "pearson", permutations = 1000)
part.mant
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = DNPP, zdis = Dgeo, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.3608
## Significance: 0.000999
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.0851 0.1147 0.1445 0.1974
## Permutation: free
## Number of permutations: 1000
part.mant2 <- mantel.partial(fst.dist, Dgeo, DNPP, method = "pearson", permutations = 1000)
part.mant2
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = DNPP, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.9262
## Significance: 0.000999
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.106 0.142 0.216 0.340
## Permutation: free
## Number of permutations: 1000
## mantel test
ibd <- mantel.randtest(fst.dist, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
##
## Observation: 0.9184113
##
## Based on 999 replicates
## Simulated p-value: 0.001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 9.1323592883 0.0004866897 0.0101029459
quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant
mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 9.66 27 0.64245382 0.001 5.911163e-01 0.69701998
## [2,] 28.98 18 0.28493265 0.010 2.269430e-01 0.34959953
## [3,] 48.30 6 0.27711014 0.012 1.790647e-01 0.34965251
## [4,] 67.62 1 0.02464136 0.956 -5.169186e-17 0.03644222
## [5,] 86.94 16 -0.23755747 0.026 -3.447686e-01 -0.14792562
## [6,] 106.26 16 -0.43851519 0.001 -5.476339e-01 -0.37083721
## [7,] 125.58 11 -0.27585827 0.011 -3.444213e-01 -0.19623107
## [8,] 144.90 8 -0.39397231 0.002 -5.044866e-01 -0.27736000
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
mantCor2 <- mgram(fst.dist, DNPP, nperm = 1000)
mantCor2
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 0.1825625 44 0.36681725 0.003 2.157538e-01 5.436962e-01
## [2,] 0.5476875 23 -0.07860655 0.291 -2.448150e-01 6.119091e-02
## [3,] 0.9128125 19 -0.36758903 0.004 -5.006758e-01 -2.413466e-01
## [4,] 1.2779375 5 -0.03904606 0.647 -1.879859e-01 9.156261e-02
## [5,] 1.6430625 3 0.09357366 0.325 -1.587588e-17 1.267946e-01
## [6,] 2.0081875 0 0.00000000 1.000 0.000000e+00 0.000000e+00
## [7,] 2.3733125 3 0.10248140 0.277 -4.074927e-17 1.332049e-01
## [8,] 2.7384375 7 -0.15494628 0.066 -2.148389e-01 8.845990e-18
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
plot(mantCor, las = 1)
plot(mantCor2, las = 1)
## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations
dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
dens2 <- MASS::kde2d(as.vector(DNPP), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))
par(mar = c(4, 4.1, 1, 1))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.07, line = 2.6, font = 2, family = "serif", las = 1)
par(mar = c(4, 4.1, 1, 1))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.07, line = 3.1, font = 2, family = "serif", las = 1)
par(mar = c(4, 4.1, 1, 1))
plot(mantCor, las = 1, cex.axis = 1, cex.lab = 1.6)
mtext("(c)", side = 2, at = 0.7, line = 2.6, font = 2, family = "serif", las = 1)
par(mar = c(4, 4.1, 1, 1))
plot(mantCor2, las = 1, cex.axis = 1, cex.lab = 1.6, ylim = c(-0.4, 0.6))
mtext("(d)", side = 2, at = 0.7, line = 3.1, font = 2, family = "serif", las = 1)
Figure 2. Patterns of isolation by distance and isolation by environment according to Partial Mantel tests and Correlograms. (a-b) Geographic and net primary production distances as a function of genetic distance. Colors represent estimated probability densities and the red line a smoothed local mean. (c-d) Mantel correlation at different distance classes as a function of geographic and net primary production distances. Filled dots indicate significant correlations.
Detecting adaptation
## extracting environmental variables from WorldClim and other sources
mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)
## country long lat Ind
## 1 Austria 15.965 48.2875 AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875 AT_Mau_14_01
## 4 Austria 15.965 48.2875 AT_Mau_14_02
## 5 Austria 15.965 48.2875 AT_Mau_15_50
## 6 Austria 15.965 48.2875 AT_Mau_15_51
## extracting NPP
NPP_file <- list.files("/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dylanpadilla/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)
prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)
## layer
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824
NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)
## NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824
##is.na(NPP.data.ext)
## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)
files <- list.files("/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
## [1] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
## [2] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
## [3] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
## [4] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
## [5] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
## [6] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
## [7] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
## [8] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif"
## [9] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif"
## [10] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif"
## [11] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif"
## [12] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif"
## [13] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif"
## [14] "/Users/dylanpadilla/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
tem_sea <- stack(files[10])
coord <- mat_adap[ , c(2, 3)]
head(coord)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)
variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)
## layer layer NPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)
## precseason tempseason logNPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
env <- cbind(mat_adap, variables)
head(env)
## country long lat Ind precseason tempseason logNPP
## 1 Austria 15.965 48.2875 AT_gr_12_fall 37.24262 744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring 37.24262 744.4373 26.30119
## 3 Austria 15.965 48.2875 AT_Mau_14_01 37.24262 744.4373 26.30119
## 4 Austria 15.965 48.2875 AT_Mau_14_02 37.24262 744.4373 26.30119
## 5 Austria 15.965 48.2875 AT_Mau_15_50 37.24262 744.4373 26.30119
## 6 Austria 15.965 48.2875 AT_Mau_15_51 37.24262 744.4373 26.30119
str(env)
## 'data.frame': 151 obs. of 7 variables:
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ long : num 16 16 16 16 16 ...
## $ lat : num 48.3 48.3 48.3 48.3 48.3 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ precseason: num 37.2 37.2 37.2 37.2 37.2 ...
## $ tempseason: num 744 744 744 744 744 ...
## $ logNPP : num 26.3 26.3 26.3 26.3 26.3 ...
## confirm that genotypes and environmental data are in the same order
x <- tab(fly_gen3, NA.method = "mean")
snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)
## [1] 151 3102
gen <- snppca[order(snppca$country), ]
merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]
## Ind 162333.A 162333.C 162335.C 162335.T
## 1 AT_gr_12_fall 1 1 1 1
## 2 AT_gr_12_spring 1 1 1 1
## 3 AT_Mau_14_01 0 2 1 1
## 4 AT_Mau_14_02 1 1 1 1
## 5 AT_Mau_15_50 1 1 1 1
rownames(merging) <- merging[ , 1]
dim(merging)
## [1] 151 3108
merging[1:5, 3101:3108]
## 163997.T country.x country.y long lat precseason
## AT_gr_12_fall 0 Austria Austria 15.965 48.2875 37.24262
## AT_gr_12_spring 0 Austria Austria 15.965 48.2875 37.24262
## AT_Mau_14_01 0 Austria Austria 15.965 48.2875 37.24262
## AT_Mau_14_02 1 Austria Austria 15.965 48.2875 37.24262
## AT_Mau_15_50 0 Austria Austria 15.965 48.2875 37.24262
## tempseason logNPP
## AT_gr_12_fall 744.4373 26.30119
## AT_gr_12_spring 744.4373 26.30119
## AT_Mau_14_01 744.4373 26.30119
## AT_Mau_14_02 744.4373 26.30119
## AT_Mau_15_50 744.4373 26.30119
merging <- merging[ , -c(1, 3102, 3103, 3104, 3105, 3106, 3107, 3108)]
merging[1:5, 3085:3100]
## 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall 1 1 1 1 2 0 1
## AT_gr_12_spring 2 0 1 1 2 0 1
## AT_Mau_14_01 1 1 2 0 1 1 1
## AT_Mau_14_02 1 1 2 0 2 0 1
## AT_Mau_15_50 1 1 1 1 1 1 1
## 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall 1 1 1 2 0 1 1
## AT_gr_12_spring 1 1 1 2 0 1 1
## AT_Mau_14_01 1 1 1 2 0 1 1
## AT_Mau_14_02 1 1 1 2 0 1 1
## AT_Mau_15_50 1 1 1 2 0 1 1
## 163997.A 163997.T
## AT_gr_12_fall 2 0
## AT_gr_12_spring 2 0
## AT_Mau_14_01 2 0
## AT_Mau_14_02 1 1
## AT_Mau_15_50 2 0
dim(merging)
## [1] 151 3100
genomic <- merging
env$Ind <- as.factor(env$Ind)
env$Ind <- as.character(levels(env$Ind))
dim(snppca)
## [1] 151 3102
snppca[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
snppca[1:5, 3085:3102]
## 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall 1 1 1 1 2 0 1
## AT_gr_12_spring 2 0 1 1 2 0 1
## AT_Mau_14_01 1 1 2 0 1 1 1
## AT_Mau_14_02 1 1 2 0 2 0 1
## AT_Mau_15_50 1 1 1 1 1 1 1
## 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall 1 1 1 2 0 1 1
## AT_gr_12_spring 1 1 1 2 0 1 1
## AT_Mau_14_01 1 1 1 2 0 1 1
## AT_Mau_14_02 1 1 1 2 0 1 1
## AT_Mau_15_50 1 1 1 2 0 1 1
## 163997.A 163997.T Ind country
## AT_gr_12_fall 2 0 AT_gr_12_fall Austria
## AT_gr_12_spring 2 0 AT_gr_12_spring Austria
## AT_Mau_14_01 2 0 AT_Mau_14_01 Austria
## AT_Mau_14_02 1 1 AT_Mau_14_02 Austria
## AT_Mau_15_50 2 0 AT_Mau_15_50 Austria
snppca <- snppca[ , -c(3101, 3102)]
snppca[1:5, 3085:3100]
## 163989.A 163989.C 163991.C 163991.G 163992.A 163992.G 163993.T
## AT_gr_12_fall 1 1 1 1 2 0 1
## AT_gr_12_spring 2 0 1 1 2 0 1
## AT_Mau_14_01 1 1 2 0 1 1 1
## AT_Mau_14_02 1 1 2 0 2 0 1
## AT_Mau_15_50 1 1 1 1 1 1 1
## 163993.C 163994.G 163994.A 163995.G 163995.A 163996.G 163996.C
## AT_gr_12_fall 1 1 1 2 0 1 1
## AT_gr_12_spring 1 1 1 2 0 1 1
## AT_Mau_14_01 1 1 1 2 0 1 1
## AT_Mau_14_02 1 1 1 2 0 1 1
## AT_Mau_15_50 1 1 1 2 0 1 1
## 163997.A 163997.T
## AT_gr_12_fall 2 0
## AT_gr_12_spring 2 0
## AT_Mau_14_01 2 0
## AT_Mau_14_02 1 1
## AT_Mau_15_50 2 0
dim(snppca)
## [1] 151 3100
identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order
## [1] TRUE
names(env)
## [1] "country" "long" "lat" "Ind" "precseason"
## [6] "tempseason" "logNPP"
env.pca <- rda(env[ , c(2, 3, 5, 6, 7)], scale = T)
summary(env.pca)$cont
## $importance
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Eigenvalue 2.1427 1.2189 0.8322 0.48306 0.32307
## Proportion Explained 0.4285 0.2438 0.1664 0.09661 0.06461
## Cumulative Proportion 0.4285 0.6723 0.8388 0.93539 1.00000
quartz()
par(las = 1)
screeplot(env.pca, main = " ")
round(scores(env.pca, choices = 1:5, display = "species", scaling = 0), digits = 3)
## PC1 PC2 PC3 PC4 PC5
## long -0.592 -0.002 -0.149 -0.421 0.671
## lat -0.598 0.068 0.080 -0.336 -0.720
## precseason 0.127 -0.689 0.635 -0.321 0.049
## tempseason 0.004 0.681 0.716 -0.022 0.151
## logNPP 0.525 0.236 -0.236 -0.779 -0.077
## attr(,"const")
## [1] 5.233176
pred.PC1 <- scores(env.pca, choices = 1, display = "sites", scaling = 0)
## determine K
## I’ll use a broken stick criterion to determine K
par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")
## genomic data
gen.pca <- rda(snppca, scale = T)
par(las = 1)
screeplot(gen.pca, main = " ", bstick = TRUE, type = "lines")
## for the genomic data, I can see that two of the PCs have eigenvalues greater than random (greater than the broken stick values in red). This effectively means that K = 2 for the dataset, based on a PCA assessment
K <- 2:9
fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[1]) ## c.ange K as you see fit
fly.lfmm2 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[2]) ## c.ange K as you see fit
fly.lfmm3 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[3]) ## c.ange K as you see fit
fly.lfmm4 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[4]) ## c.ange K as you see fit
fly.lfmm5 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[5]) ## c.ange K as you see fit
fly.lfmm6 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[6]) ## c.ange K as you see fit
fly.lfmm7 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[7]) ## c.ange K as you see fit
fly.lfmm8 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K[8]) ## c.ange K as you see fit
fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm1, calibrate = "gif")
fly.pv2 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm2, calibrate = "gif")
fly.pv3 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm3, calibrate = "gif")
fly.pv4 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm4, calibrate = "gif")
fly.pv5 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm5, calibrate = "gif")
fly.pv6 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm6, calibrate = "gif")
fly.pv7 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm7, calibrate = "gif")
fly.pv8 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm8, calibrate = "gif")
names(fly.pv1) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values
## [1] "B" "epsilon.sigma2" "B.sigma2"
## [4] "score" "pvalue" "gif"
## [7] "calibrated.score2" "calibrated.pvalue"
## let’s look at the genomic inflation factor (GIF)
fly.pv1$gif
## PC1
## 1.522458
fly.pv2$gif
## PC1
## 1.354105
fly.pv3$gif
## PC1
## 1.378276
fly.pv4$gif
## PC1
## 1.307732
fly.pv5$gif
## PC1
## 1.286765
fly.pv6$gif
## PC1
## 1.310563
fly.pv7$gif
## PC1
## 1.314588
fly.pv8$gif
## PC1
## 1.323961
## an appropriately calibrated set of tests will have a GIF of around 1
## let’s look at how application of the GIF to the p-values impacts the p-value distribution
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4,
5, 5, 6, 6,
5, 5, 6, 6,
7, 7, 8, 8,
7, 7, 8, 8), nrow = 8, ncol = 4, byrow = TRUE))
par(mar = c(5, 5, 0.2, 1), las = 1)
hist(fly.pv1$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 2", bty = "n")
hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 2", bty = "n")
hist(fly.pv2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 3", bty = "n")
hist(fly.pv2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 3", bty = "n")
hist(fly.pv3$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 4", bty = "n")
hist(fly.pv3$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 4", bty = "n")
hist(fly.pv4$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 5", bty = "n")
hist(fly.pv4$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 5", bty = "n")
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4,
5, 5, 6, 6,
5, 5, 6, 6,
7, 7, 8, 8,
7, 7, 8, 8), nrow = 8, ncol = 4, byrow = TRUE))
par(mar = c(5, 5, 0.2, 1), las = 1)
hist(fly.pv5$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 6", bty = "n")
hist(fly.pv5$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 6", bty = "n")
hist(fly.pv6$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 7", bty = "n")
hist(fly.pv6$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 7", bty = "n")
hist(fly.pv7$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 8", bty = "n")
hist(fly.pv7$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 8", bty = "n")
hist(fly.pv8$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 500))
legend("top", "K = 9", bty = "n")
hist(fly.pv8$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")
## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)
fly.qv1 <- qvalue(fly.pv1$calibrated.pvalue)$qvalues
length(which(fly.qv1 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv2 <- qvalue(fly.pv2$calibrated.pvalue)$qvalues
length(which(fly.qv2 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv3 <- qvalue(fly.pv3$calibrated.pvalue)$qvalues
length(which(fly.qv3 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv4 <- qvalue(fly.pv4$calibrated.pvalue)$qvalues
length(which(fly.qv4 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 0
fly.qv5 <- qvalue(fly.pv5$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv6 <- qvalue(fly.pv6$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv7 <- qvalue(fly.pv7$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv8 <- qvalue(fly.pv8$calibrated.pvalue)$qvalues
length(which(fly.qv5 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
## using K from 2 to 9, the default GIF correction, and an FDR threshold of 0.05, I detected 2 candidate SNPs under selection in response to the PC1 environmental predictor
(fly.FDR.5 <- colnames(snppca)[which(fly.qv5 < 0.05)]) ## identify which SNPs these are
## [1] "163005.A" "163005.G" "163496.G" "163496.A"
## visualizing results
## Manhattan plot with causal loci
qvalues <- fly.qv5
row <- as.data.frame(qvalues)
str(row)
## 'data.frame': 3100 obs. of 1 variable:
## $ PC1: num 0.959 0.959 0.846 0.846 0.997 ...
rownames(row) <- 1:3100
(causal.set <- as.numeric(rownames(row)[which(row < 0.05)]))
## [1] 1251 1252 2161 2162
(fly.FDR.5 <- gsub("[^0-9]", "", fly.FDR.5))
## [1] "163005" "163005" "163496" "163496"
quartz()
plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1)
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.05), fly.FDR.5, col = "red")
quartz()
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
0, 2, 2, 0,
0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))
par(las = 1, mar = c(4, 4, 1, 0.6))
hist(fly.pv5$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "", ylim = c(0, 350))
legend("top", paste("K = 6\nGIF = ", round(fly.pv5$gif, 2)), bty = "n")
mtext("(a)", side = 2, at = 370, line = 2.5, las = 1, font = 2, family = "serif")
plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("black", 0.1), xlab = "SNP", las = 1)
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.05), fly.FDR.5, col = "red")
mtext("(b)", side = 2, at = 1.45, line = 2.5, las = 1, font = 2, family = "serif")
Figure 6. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor. (b) Manhattan plot of loci (SNPs) potentially affected by the PC1 predictor variable. Loci highlighted in red were considered to be under selection according to a False Discovery Rate of 0.05.